#### Prepare the data
#### Jonne Kamphorst
#### February 2021


#### Workflow for  data:
## (i) Clean the basic data from a frame for part 2 and a frame for part 1
## (i) Clean data in excel: id column first, take out all covariates, only selected and F columns
## (ii) run the hainmueller code to turn into the right dataframe
## (iii) change the column names so that you can then make it an even longer dataframe
# Perhaps I need to code first  whether respondent and politician agree and such
## (iv) add covariates back in via respondent id, task, and profile,after turning this
#       into wide using a separate dataframe
## (v) merge data from first and second parts of the survey together



#### Load libraries ####
library(dplyr)
library(tidyverse)
library(cjoint) #for creating dataframe
library(stringr) #for reg expr
library(gdata) #for keep
library(readxl)
library(stringr)
library(Hmisc)


#### Load data frames for normal questions ####
## Cleaning part 1:
# cleaned in excel: Remove 2nd and 3th row and remove qualtrics' standard columns
data_1 <- read.csv("data/part1.csv") #first part of the survey


# remove the columns that are for the pvda experiment. 
data_1 <- data_1 %>% select(-c(Q8.2_1, Q8.2_2, Q8.2_3, Q8.2_4, Q8.2_5,
                               Q8.2_6, Q8.2_8))

# Remove the attention checks
data_1 <- data_1 %>% select(-c(Q9.3,	`Q9.4_First.Click`, `Q9.4_Last.Click`, `Q9.4_Page.Submit`, `Q9.4_Click.Count`,
                               Q9.5, `Q9.6_First.Click`, 	`Q9.6_Last.Click`,	`Q9.6_Page.Submit`,	`Q9.6_Click.Count`,
                               Q9.7, 	`Q9.8_First.Click`,	`Q9.8_Last.Click`,	`Q9.8_Page.Submit`,
                               `Q9.8_Click.Count`,	Q9.10))

data_1 <- subset(data_1, gc==1) #drop the respondents who did not pass the attention checks or did not want to participate
data_1 <- data_1 %>% select(-gc) %>% distinct(rid, .keep_all=T) #drop column to that it doesn't merge on it later



## Cleaning part 2:
# Cleaned in excel: cleaned the same way as part 1 and also removed all conjoint questions and pvda questions
data_2 <- read_xlsx("data/part2.xlsx") #has to be excel for the time variable
data_2 <- subset(data_2, gc==1) #people who didn't finish (none)
data_2 <- data_2 %>% select(-gc)

# extract the credits left from the url.
data_2$credit.left <- as.numeric(str_extract(data_2$Q_URL, "(?i)(?<=credrem\\D)\\d+"))


# Create absolute values from QV variables
data_2$QV1_abs <- abs(data_2$QV1)
data_2$QV2_abs <- abs(data_2$QV2)
data_2$QV3_abs <- abs(data_2$QV3)
data_2$QV4_abs <- abs(data_2$QV4)
data_2$QV5_abs <- abs(data_2$QV5)
data_2$QV6_abs <- abs(data_2$QV6)

# rename the wrong qv columns
data_2 <- data_2 %>% rename(QV1_wrong = QV1,
                            QV2_wrong = QV2,
                            QV3_wrong = QV3,
                            QV4_wrong = QV4,
                            QV5_wrong = QV5,
                            QV6_wrong = QV6)




## Check how many credits people used (just to see if it lines up with the URL data)
data_2$credits <- (data_2$QV1_abs)^2 +
  (data_2$QV2_abs)^2 +
  (data_2$QV3_abs)^2 +
  (data_2$QV4_abs)^2 +
  (data_2$QV5_abs)^2 +
  (data_2$QV6_abs)^2

data_2$credrem_manual <- 36 - data_2$credits

non_matching <- subset(data_2, !(credrem_manual %in% data_2$credit.left)) #should be 0, checks if credits left manual and from url are the same
rm(non_matching)


#make a dataframe with faulty credit use
table(data_2$credrem_manual)

#mistake_rids <- subset(data_2, credrem_manual == 30)
#mistake_rids <- mistake_rids %>% select(rid)
#write.csv(mistake_rids, "mistake_rids.csv", row.names = F)
#rm(mistake_rids)

#and one for survey speeders
#survey_speeders <- subset(data_2, `Duration (in seconds)` < 120 | credrem_manual == 36)
#survey_speeders <- survey_speeders %>% select(rid)
#write.csv(survey_speeders, "survey_speeders.csv", row.names = F)



##### Add the QVSR data and match it with part 2 ####
# have to match because the user ids are not carried over due to bug in QVSR
QVSR <- read.csv("data/qvsr_data.csv") 

# Change day and time format so that it matches with Qualtrics
QVSR$date_wyear <- paste("2021/", QVSR$date, sep="") # add the year
QVSR$date_wyear_wtime <- paste(QVSR$date_wyear, QVSR$end.time, sep=" ") 
QVSR$date_wyear_wtime <- gsub("/", "-", QVSR$date_wyear_wtime) #replace dash
QVSR <- QVSR %>% select(-date_wyear) 



#create a version without the seconds to match on as seconds don't perfectly overlap
data_2$matching_time_mins <- round(data_2$StartDate, units = "mins")
data_2$matching_time_hours <- round(data_2$StartDate, units = "hour")
data_2$matching_time_day <- round(data_2$StartDate, units = "day")

QVSR$matching_time <- as.POSIXct(QVSR$date_wyear_wtime, tz="UTC")
QVSR$matching_time_mins <- round(QVSR$matching_time, units = "mins")
QVSR$matching_time_hours <- round(QVSR$matching_time, units = "hours")
QVSR$matching_time_day <- round(QVSR$matching_time, units = "day")

#drop QVSR responses from before the survey collection startes to make matching easier
QVSR <- subset(QVSR, QVSR$matching_time >= as.POSIXct("2021-03-10"))


#Calculate the wrong QV values for the matching to Qualtrics. WORKS
QVSR$QV1_wrong[QVSR$S1 == "QV1"] <- QVSR$QV1[QVSR$S1 == "QV1"]
QVSR$QV1_wrong[QVSR$S1 == "QV2"] <- QVSR$QV2[QVSR$S1 == "QV2"]
QVSR$QV1_wrong[QVSR$S1 == "QV3"] <- QVSR$QV3[QVSR$S1 == "QV3"]
QVSR$QV1_wrong[QVSR$S1 == "QV4"] <- QVSR$QV4[QVSR$S1 == "QV4"]
QVSR$QV1_wrong[QVSR$S1 == "QV5"] <- QVSR$QV5[QVSR$S1 == "QV5"]
QVSR$QV1_wrong[QVSR$S1 == "QV6"] <- QVSR$QV6[QVSR$S1 == "QV6"]

QVSR$QV2_wrong[QVSR$S2 == "QV1"] <- QVSR$QV1[QVSR$S2 == "QV1"]
QVSR$QV2_wrong[QVSR$S2 == "QV2"] <- QVSR$QV2[QVSR$S2 == "QV2"]
QVSR$QV2_wrong[QVSR$S2 == "QV3"] <- QVSR$QV3[QVSR$S2 == "QV3"]
QVSR$QV2_wrong[QVSR$S2 == "QV4"] <- QVSR$QV4[QVSR$S2 == "QV4"]
QVSR$QV2_wrong[QVSR$S2 == "QV5"] <- QVSR$QV5[QVSR$S2 == "QV5"]
QVSR$QV2_wrong[QVSR$S2 == "QV6"] <- QVSR$QV6[QVSR$S2 == "QV6"]

QVSR$QV3_wrong[QVSR$S3 == "QV1"] <- QVSR$QV1[QVSR$S3 == "QV1"]
QVSR$QV3_wrong[QVSR$S3 == "QV2"] <- QVSR$QV2[QVSR$S3 == "QV2"]
QVSR$QV3_wrong[QVSR$S3 == "QV3"] <- QVSR$QV3[QVSR$S3 == "QV3"]
QVSR$QV3_wrong[QVSR$S3 == "QV4"] <- QVSR$QV4[QVSR$S3 == "QV4"]
QVSR$QV3_wrong[QVSR$S3 == "QV5"] <- QVSR$QV5[QVSR$S3 == "QV5"]
QVSR$QV3_wrong[QVSR$S3 == "QV6"] <- QVSR$QV6[QVSR$S3 == "QV6"]

QVSR$QV4_wrong[QVSR$S4 == "QV1"] <- QVSR$QV1[QVSR$S4 == "QV1"]
QVSR$QV4_wrong[QVSR$S4 == "QV2"] <- QVSR$QV2[QVSR$S4 == "QV2"]
QVSR$QV4_wrong[QVSR$S4 == "QV3"] <- QVSR$QV3[QVSR$S4 == "QV3"]
QVSR$QV4_wrong[QVSR$S4 == "QV4"] <- QVSR$QV4[QVSR$S4 == "QV4"]
QVSR$QV4_wrong[QVSR$S4 == "QV5"] <- QVSR$QV5[QVSR$S4 == "QV5"]
QVSR$QV4_wrong[QVSR$S4 == "QV6"] <- QVSR$QV6[QVSR$S4 == "QV6"]

QVSR$QV5_wrong[QVSR$S5 == "QV1"] <- QVSR$QV1[QVSR$S5 == "QV1"]
QVSR$QV5_wrong[QVSR$S5 == "QV2"] <- QVSR$QV2[QVSR$S5 == "QV2"]
QVSR$QV5_wrong[QVSR$S5 == "QV3"] <- QVSR$QV3[QVSR$S5 == "QV3"]
QVSR$QV5_wrong[QVSR$S5 == "QV4"] <- QVSR$QV4[QVSR$S5 == "QV4"]
QVSR$QV5_wrong[QVSR$S5 == "QV5"] <- QVSR$QV5[QVSR$S5 == "QV5"]
QVSR$QV5_wrong[QVSR$S5 == "QV6"] <- QVSR$QV6[QVSR$S5 == "QV6"]

QVSR$QV6_wrong[QVSR$S6 == "QV1"] <- QVSR$QV1[QVSR$S6 == "QV1"]
QVSR$QV6_wrong[QVSR$S6 == "QV2"] <- QVSR$QV2[QVSR$S6 == "QV2"]
QVSR$QV6_wrong[QVSR$S6 == "QV3"] <- QVSR$QV3[QVSR$S6 == "QV3"]
QVSR$QV6_wrong[QVSR$S6 == "QV4"] <- QVSR$QV4[QVSR$S6 == "QV4"]
QVSR$QV6_wrong[QVSR$S6 == "QV5"] <- QVSR$QV5[QVSR$S6 == "QV5"]
QVSR$QV6_wrong[QVSR$S6 == "QV6"] <- QVSR$QV6[QVSR$S6 == "QV6"]



## Match #

# try match
#only keep unique values
#data_2 <- data_2 %>% distinct(matching_time, QV1_wrong, QV2_wrong, QV3_wrong, QV4_wrong, QV5_wrong,
#                                  QV6_wrong, credit.left, .keep_all = T) #drops 0
#QVSR <- QVSR %>% distinct(matching_time, QV1_wrong, QV2_wrong, QV3_wrong, QV4_wrong, QV5_wrong,
#                           QV6_wrong, credit.left, .keep_all = T) #drops 4



#### match cases between QVSR and other data
# match A and B with the strictest criteria (minutes), 
match_1 <- inner_join(data_2, QVSR) %>% 
  group_by(QV1_wrong, QV2_wrong, QV3_wrong, #inner join
                                QV4_wrong, QV5_wrong, QV6_wrong, #group by values you merged on
                                credit.left, matching_time_mins, matching_time_hours, matching_time_day) %>% 
  mutate(duplicated = n()) %>% #add count
  ungroup() 
table(match_1$duplicated) #observations lost. There should be enough matches here anyway! Otherwise something is wrong
match_1 <- match_1 %>% filter(duplicated == 1) #filter out cases that got double matched
unmatched_data2 <- anti_join(data_2, QVSR)
unmatched_qvsr <- anti_join(QVSR, data_2)


# Match on less strict (hours)
unmatched_data2 <- unmatched_data2 %>% select(-matching_time_mins)
unmatched_qvsr <- unmatched_qvsr %>%  select(-matching_time_mins)

match_2 <- inner_join(unmatched_data2, unmatched_qvsr) %>% 
  group_by(QV1_wrong, QV2_wrong, QV3_wrong, #inner join
           QV4_wrong, QV5_wrong, QV6_wrong, #group by values you merged on
           credit.left, matching_time_hours, matching_time_day) %>% 
  mutate(duplicated = n()) %>% #add count
  ungroup() 
table(match_2$duplicated) #observations lost
match_2 <- match_2 %>% filter(duplicated == 1) #filter out cases that got double matched
unmatched_data2 <- anti_join(unmatched_data2, unmatched_qvsr)
unmatched_qvsr <- anti_join(unmatched_qvsr, unmatched_data2)


# Match on less strict (day)
unmatched_data2 <- unmatched_data2 %>% select(-matching_time_hours)
unmatched_qvsr <- unmatched_qvsr %>%  select(-matching_time_hours)

match_3 <- inner_join(unmatched_data2, unmatched_qvsr) %>% 
  group_by(QV1_wrong, QV2_wrong, QV3_wrong, #inner join
           QV4_wrong, QV5_wrong, QV6_wrong, #group by values you merged on
           credit.left, matching_time_day) %>% mutate(duplicated = n()) %>% #add count
  ungroup() 
table(match_3$duplicated) #observations lost
match_3 <- match_3 %>% filter(duplicated == 1) #filter out cases that got double matched
unmatched_data2 <- anti_join(unmatched_data2, unmatched_qvsr)
unmatched_qvsr <- anti_join(unmatched_qvsr, unmatched_data2)




# Match on less strict (QVSR only)
unmatched_data2 <- unmatched_data2 %>% select(-matching_time_day)
unmatched_qvsr <- unmatched_qvsr %>%  select(-matching_time_day)

match_4 <- inner_join(unmatched_data2, unmatched_qvsr) %>% 
  group_by(QV1_wrong, QV2_wrong, QV3_wrong, #inner join
           QV4_wrong, QV5_wrong, QV6_wrong, #group by values you merged on
           credit.left) %>% mutate(duplicated = n()) %>% #add count
  ungroup() 
table(match_4$duplicated) #observations lost
match_4 <- match_4 %>% filter(duplicated == 1) #filter out cases that got double matched
unmatched_data2 <- anti_join(unmatched_data2, unmatched_qvsr)
unmatched_qvsr <- anti_join(unmatched_qvsr, unmatched_data2)




#merge the different matching dataframes together
match_1 <- match_1 %>% select(-matching_time_day, -matching_time_hours, -matching_time_mins)
match_2 <- match_2 %>% select(-matching_time_day, -matching_time_hours)
match_3 <- match_3 %>% select(-matching_time_day)

final_data <- full_join(match_1, match_2)
final_data <- full_join(final_data, match_3)
final_data <- full_join(final_data, match_4)

length(unique(final_data$rid))

final_data <- inner_join(final_data, data_1)
keep(final_data, sure=T)







#remove the people who raced through the survey
mean(final_data$`Duration (in seconds)`)
min(final_data$`Duration (in seconds)`)
max(final_data$`Duration (in seconds)`)
describe(final_data$`Duration (in seconds)`)
final_data <- subset(final_data, `Duration (in seconds)` > 100) # at least 100 seconds in part 2






#### Load conjoint data frames ####


## load conjoint data for the *choice* of profiles using the hainmueller package
# Cleaned in excel: only conjoint columns, ID variable should be first column. 
responses_choice <- c("QPT4.2", "QPT4.3", "QPT4.4", "QPT4.5", "QPT4.7", "QPT4.10",
               "QPT4.13", "QPT4.16", "QPT4.19", "QPT4.23") #variable names for answers. 


conjoint_df_choice <- read.qualtrics(filename="data/conjoint.csv", responses = responses_choice, 
                              respondentID = "rid", letter = "F", new.format = T) #user ID needs to be the first column!



# drop the columns I don't need #
drop <- c("respondentIndex", "We.moeten.klimaatverandering.hard.aanpakken.rowpos",
          "Scholen.mogen.homoseksualiteit.afkeuren.rowpos", "Megastallen.moeten.verboden.worden.rowpos",
          "Nederland.moet.meer.vluchtelingen.opnemen.rowpos", "Het.minimumloon.moet.omhoog.rowpos",
          "Er.moet.een.referendum.komen.over.de.EU.rowpos", "Partij.rowpos")
conjoint_df_choice <- conjoint_df_choice %>% select(-drop)
rm(drop)

conjoint_df_choice$task <- as.numeric(conjoint_df_choice$task) #change type and names so you can join later
conjoint_df_choice$profile <- as.numeric(conjoint_df_choice$profile)
conjoint_df_choice <- conjoint_df_choice %>% rename(rid = respondent)






## Load conjoint data for the *mechanism* variables
data_cjoint_mech <- read.csv("data/conjoint_mech.csv") #

#select the right columns
data_cjoint_mech <- data_cjoint_mech %>% select(rid, # response id
                                                QPT4.11_1, QPT4.11_2, QPT4.12_1, QPT4.12_2, #profile 6
                                                QPT4.14_1, QPT4.14_2, QPT4.15_1, QPT4.15_2, #profile 7
                                                QPT4.17_1, QPT4.17_2, QPT4.18_1, QPT4.18_2, #profile 8
                                                QPT4.20_1, QPT4.20_4, QPT4.21_1, QPT4.21_4, #profile 9
                                                QPT4.24_1, QPT4.24_4, QPT4.25_1, QPT4.25_4) #profile 10
                                                
#rename to make it possible to make the data longer
data_cjoint_mech <- data_cjoint_mech %>% rename(`6_1_position` = QPT4.11_1, 
                                                `6_2_position` = QPT4.11_2, 
                                                `6_1_clarity` = QPT4.12_1, 
                                                `6_2_clarity` = QPT4.12_2, 
                                                `7_1_position` = QPT4.14_1, 
                                                `7_2_position` = QPT4.14_2, 
                                                `7_1_clarity` = QPT4.15_1, 
                                                `7_2_clarity` = QPT4.15_2,
                                                `8_1_position` = QPT4.17_1, 
                                                `8_2_position` = QPT4.17_2, 
                                                `8_1_clarity` = QPT4.18_1, 
                                                `8_2_clarity` = QPT4.18_2,
                                                `9_1_position` = QPT4.20_1, 
                                                `9_2_position` = QPT4.20_4, 
                                                `9_1_clarity` = QPT4.21_1, 
                                                `9_2_clarity` = QPT4.21_4,
                                                `10_1_position` = QPT4.24_1, 
                                                `10_2_position` = QPT4.24_4, 
                                                `10_1_clarity` = QPT4.25_1, 
                                                `10_2_clarity` = QPT4.25_4)

                                                

# First turn into wide for position
data_position <- data_cjoint_mech %>% select(rid, ends_with("position")) #only take position
data_position <- data_position %>% pivot_longer(!rid, names_to="task_profile", values_to="position") #turn into wider
data_position <- data_position %>% mutate(task_profile = str_remove_all(task_profile, "_position")) # remove position ending
data_position <- data_position %>% separate(task_profile, c("task", "profile")) #separate the task and profile

# And for clarity
data_clarity <- data_cjoint_mech %>% select(rid, ends_with("clarity")) #only take clarity
data_clarity <- data_clarity %>% pivot_longer(!rid, names_to="task_profile", values_to="clarity") #turn into wider
data_clarity <- data_clarity %>% mutate(task_profile = str_remove_all(task_profile, "_clarity")) # remove position ending
data_clarity <- data_clarity %>% separate(task_profile, c("task", "profile")) #separate the task and profile

#Join the data together
data_cjoint_mech <- full_join(data_position, data_clarity)
data_cjoint_mech$task <- as.numeric(data_cjoint_mech$task) #change type so can join
data_cjoint_mech$profile <- as.numeric(data_cjoint_mech$profile)
rm(data_position, data_clarity)



#### Create the final data frame wide format ####
data_cjoint <- full_join(data_cjoint_mech, conjoint_df_choice) #join mechanism and choice outcomes

## Merge other part 2 and part 1 data in
final_data_wide <- inner_join(data_cjoint, final_data)




## Check results using Hainmueller
cjoint_design <- makeDesign(type="file", filename = "C:/Users/jonne/OneDrive/EUI/thesis/chapters/salience experiment/conjoint/02032021.dat")


x <- amce(selected ~ Er.moet.een.referendum.komen.over.de.EU + 
            Het.minimumloon.moet.omhoog +
            Nederland.moet.meer.vluchtelingen.opnemen +
            Megastallen.moeten.verboden.worden + 
            Scholen.mogen.homoseksualiteit.afkeuren +
            We.moeten.klimaatverandering.hard.aanpakken +
            Partij, data=final_data_wide, design= cjoint_design, respondent.id = "rid",
          cluster=TRUE)


summary(x)
cjoint::plot.amce(x, main="try plot")

# Rename column names
final_data_wide <- final_data_wide %>% rename(`EU`= Er.moet.een.referendum.komen.over.de.EU,
                                      `wage` = Het.minimumloon.moet.omhoog,
                                      `immi` = Nederland.moet.meer.vluchtelingen.opnemen,
                                      `farms` = Megastallen.moeten.verboden.worden,
                                      `lgbt` = Scholen.mogen.homoseksualiteit.afkeuren,
                                      `climate` = We.moeten.klimaatverandering.hard.aanpakken) %>%
  relocate(selected, .after=profile)





# Sage the wide data
#write_rds(final_data_wide, "data/final_data_wide.rds")






## Turn into a longer dataframe where the attributes are in a single column
# Rename column names
data_cjoint <- data_cjoint %>% rename(`EU`= Er.moet.een.referendum.komen.over.de.EU,
                                      `wage` = Het.minimumloon.moet.omhoog,
                                      `immi` = Nederland.moet.meer.vluchtelingen.opnemen,
                                      `farms` = Megastallen.moeten.verboden.worden,
                                      `lgbt` = Scholen.mogen.homoseksualiteit.afkeuren,
                                      `climate` = We.moeten.klimaatverandering.hard.aanpakken) %>%
  relocate(selected, .after=profile)


cols <- c("rid", "task", "profile", "selected", "Partij", "position", "clarity")
final_data_l <- data_cjoint %>% pivot_longer(!cols, names_to = "issue", values_to = "strategy")
rm(cols)



## Merge other part 2 and part 1 data in
final_data_l <- inner_join(final_data_l, final_data)
#write_rds(final_data_l, "data/final_data_long.rds")
keep(final_data, sure=T)
rm(final_data)




















